library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.4
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.2 v dplyr 0.7.7
## v tidyr 0.8.1 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.3.0
## Warning: package 'ggplot2' was built under R version 3.4.4
## Warning: package 'tibble' was built under R version 3.4.4
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'forcats' was built under R version 3.4.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.4.4
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Load the DengAI competition files. All three CSVs are expected in `data_dir`.
data_dir <- "D:/Google Drive/RYERSON/CKME 136/DengAI/DATASET"
d_train <- read_csv(file.path(data_dir, "dengue_features_train.csv"))
d_labels <- read_csv(file.path(data_dir, "dengue_labels_train.csv"))
# The test features come from their own file. Reading the training file here
# would make every "train + test" frame built later just the training rows
# stacked twice.
d_test <- read_csv(file.path(data_dir, "dengue_features_test.csv"))

# Reanalysis temperature columns stored in Kelvin. They are shifted to Celsius
# in both data sets; the column names keep their original `_k` suffix.
kelvin_cols <- c(
  "reanalysis_dew_point_temp_k",
  "reanalysis_air_temp_k",
  "reanalysis_max_air_temp_k",
  "reanalysis_min_air_temp_k",
  "reanalysis_avg_temp_k"
)
d_train[kelvin_cols] <- d_train[kelvin_cols] - 273.15
d_test[kelvin_cols] <- d_test[kelvin_cols] - 273.15
#!!!tdtr does not appear to be in Kelvin
# (its values are far too small to be absolute temperatures, so no offset is
# applied -- the lines below stay disabled)
# d_train$reanalysis_tdtr_k <- d_train$reanalysis_tdtr_k - 273.15
# d_test$reanalysis_tdtr_k <- d_test$reanalysis_tdtr_k - 273.15
# Sanity check after the Kelvin -> Celsius shift: print the summary (quartiles,
# mean and NA count) of each converted training column.
summary(d_train$reanalysis_dew_point_temp_k)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 16.49 20.97 22.49 22.10 23.31 25.30 10
summary(d_train$reanalysis_air_temp_k)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 21.49 24.51 25.50 25.55 26.68 29.05 10
summary(d_train$reanalysis_max_air_temp_k)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 24.65 27.85 29.25 30.28 32.35 40.85 10
summary(d_train$reanalysis_min_air_temp_k)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 13.75 20.75 23.05 22.57 24.75 26.75 10
summary(d_train$reanalysis_avg_temp_k)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 21.74 25.11 26.14 26.08 27.06 29.78 10
# reanalysis_tdtr_k was NOT shifted (its conversion is commented out above);
# its summary is printed for comparison with the converted columns.
summary(d_train$reanalysis_tdtr_k)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.357 2.329 2.857 4.904 7.625 16.029 10
# Attach the weekly case counts to the training features; rows are matched on
# the (city, year, weekofyear) key.
merge_keys <- c("city", "year", "weekofyear")
df <- merge(x = d_train, y = d_labels, by = merge_keys)
rm(merge_keys)
# Per-city views of the labelled training data.
sj <- df[df$city == "sj", ]
iq <- df[df$city == "iq", ]
Stack the training and test feature sets (neither contains total_cases) into one data frame, then split it by city.
# Training rows followed by test rows, features only (no total_cases).
df_all <- rbind(d_train, d_test)
# Per-city views of the stacked features.
sj_all <- df_all[df_all$city == "sj", ]
iq_all <- df_all[df_all$city == "iq", ]
This section imports the data, creates new variables, and sets up the data frames used in the initial analysis.
library(skimr)
## Warning: package 'skimr' was built under R version 3.4.4
# Skim the labelled SJ data, leaving out columns 1 and 4
# (presumably city and week_start_date -- confirm against the column order).
skimmed.sj <- skim_to_wide(sj[, -c(1, 4)])
## Warning: package 'bindrcpp' was built under R version 3.4.4
skimmed.sj
## # A tibble: 23 x 13
## type variable missing complete n mean sd p0 p25 p50
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 integ~ total_cas~ 0 936 936 " 34~ 51.38 0 " ~ " 1~
## 2 integ~ weekofyear 0 936 936 " 26~ 15.02 1 " 1~ " 2~
## 3 integ~ year 0 936 936 1998.~ " 5.~ 1990 "199~ "199~
## 4 numer~ ndvi_ne 191 745 936 " 0.0~ " 0.~ "-0.~ " 0.~ " 0.~
## 5 numer~ ndvi_nw 49 887 936 " 0.0~ " 0.~ "-0.~ " 0.~ " 0.~
## 6 numer~ ndvi_se 19 917 936 " 0.1~ " 0.~ -0.0~ " 0.~ " 0.~
## 7 numer~ ndvi_sw 19 917 936 " 0.1~ " 0.~ -0.0~ " 0.~ " 0.~
## 8 numer~ precipita~ 9 927 936 "35.4~ "44.~ " 0 ~ " 0 ~ "20.~
## 9 numer~ reanalysi~ 6 930 936 "26.0~ " 1.~ "22.~ "25.~ "26.~
## 10 numer~ reanalysi~ 6 930 936 "26.1~ " 1.~ "22.~ "25.~ "26.~
## # ... with 13 more rows, and 3 more variables: p75 <chr>, p100 <chr>,
## # hist <chr>
# Skim the labelled IQ data, leaving out columns 1 and 4
# (presumably city and week_start_date -- confirm against the column order).
skimmed.iq <- skim_to_wide(iq[, -c(1, 4)])
skimmed.iq
## # A tibble: 23 x 13
## type variable missing complete n mean sd p0 p25 p50
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 integ~ total_cas~ 0 520 520 " 7~ 10.77 0 " ~ " ~
## 2 integ~ weekofyear 0 520 520 " 26~ 15.03 1 " 1~ " 2~
## 3 integ~ year 0 520 520 "2005~ " 2.~ 2000 2002~ "200~
## 4 numer~ ndvi_ne 3 517 520 " 0.2~ " 0.~ " 0.~ " 0.~ " 0.~
## 5 numer~ ndvi_nw 3 517 520 " 0.2~ " 0.~ " 0.~ " 0.~ " 0.~
## 6 numer~ ndvi_se 3 517 520 " 0.2~ " 0.~ " 0.~ " 0.~ " 0.~
## 7 numer~ ndvi_sw 3 517 520 " 0.2~ " 0.~ " 0.~ " 0.~ " 0.~
## 8 numer~ precipita~ 4 516 520 64.25 "35.~ " 0 ~ 39.11 60.47
## 9 numer~ reanalysi~ 4 516 520 24.72 " 1.~ "21.~ 23.94 24.67
## 10 numer~ reanalysi~ 4 516 520 25.98 " 1.~ "21.~ 25.07 25.97
## # ... with 13 more rows, and 3 more variables: p75 <chr>, p100 <chr>,
## # hist <chr>
# Skim the stacked SJ features, leaving out columns 1 and 4
# (presumably city and week_start_date -- confirm against the column order).
skimmed.sj_all <- skim_to_wide(sj_all[, -c(1, 4)])
skimmed.sj_all
## # A tibble: 22 x 13
## type variable missing complete n mean sd p0 p25 p50
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 integ~ weekofyear 0 1872 1872 " 26~ 15.02 1 " 1~ " 2~
## 2 integ~ year 0 1872 1872 1998.~ " 5.~ 1990 "199~ "199~
## 3 numer~ ndvi_ne 382 1490 1872 " 0.0~ " 0.~ "-0.~ " 0.~ " 0.~
## 4 numer~ ndvi_nw 98 1774 1872 " 0.0~ " 0.~ "-0.~ " 0.~ " 0.~
## 5 numer~ ndvi_se 38 1834 1872 " 0.1~ " 0.~ -0.0~ " 0.~ " 0.~
## 6 numer~ ndvi_sw 38 1834 1872 " 0.1~ " 0.~ -0.0~ " 0.~ " 0.~
## 7 numer~ precipita~ 18 1854 1872 "35.4~ "44.~ " 0 ~ " 0 ~ "20.~
## 8 numer~ reanalysi~ 12 1860 1872 "26.0~ " 1.~ "22.~ "25.~ "26.~
## 9 numer~ reanalysi~ 12 1860 1872 "26.1~ " 1.~ "22.~ "25.~ "26.~
## 10 numer~ reanalysi~ 12 1860 1872 "21.9~ " 1.~ "16.~ "20.~ "22.~
## # ... with 12 more rows, and 3 more variables: p75 <chr>, p100 <chr>,
## # hist <chr>
# Skim the stacked IQ features, leaving out columns 1 and 4
# (presumably city and week_start_date -- confirm against the column order).
skimmed.iq_all <- skim_to_wide(iq_all[, -c(1, 4)])
skimmed.iq_all
## # A tibble: 22 x 13
## type variable missing complete n mean sd p0 p25 p50
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 integ~ weekofyear 0 1040 1040 " 2~ 15.02 1 " 1~ " 2~
## 2 integ~ year 0 1040 1040 "200~ " 2.~ 2000 2002~ "200~
## 3 numer~ ndvi_ne 6 1034 1040 " 0.~ " 0.~ " 0.~ " 0.~ " 0.~
## 4 numer~ ndvi_nw 6 1034 1040 " 0.~ " 0.~ " 0.~ " 0.~ " 0.~
## 5 numer~ ndvi_se 6 1034 1040 " 0.~ " 0.~ " 0.~ " 0.~ " 0.~
## 6 numer~ ndvi_sw 6 1034 1040 " 0.~ " 0.~ " 0.~ " 0.~ " 0.~
## 7 numer~ precipitat~ 8 1032 1040 64.25 "35.~ " 0 ~ 39.11 60.47
## 8 numer~ reanalysis~ 8 1032 1040 24.72 " 1.~ "21.~ 23.94 24.67
## 9 numer~ reanalysis~ 8 1032 1040 25.98 " 1.~ "21.~ 25.07 25.97
## 10 numer~ reanalysis~ 8 1032 1040 22.34 " 1.~ "16.~ 21.44 "22.~
## # ... with 12 more rows, and 3 more variables: p75 <chr>, p100 <chr>,
## # hist <chr>
rm(skimmed.sj, skimmed.iq, skimmed.iq_all, skimmed.sj_all)
Clean up all the extra dataframes produced during the exploratory analysis
#rm(d_test,
# d_train,
# dengue_labels_train,
# sj_test,
# sj_features_train,
# sj_labels_train,
# iq_test,
# iq_features_train,
# iq_labels_train,
# df,
# iq,
# sj,
# df,
# submission_format
# )
In this section, we summarise the values in the data frames (together and by city). We also create the following graphs.
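The Wilcoxon rank-sum tests below show that nearly every feature differs significantly between the two cities (p < 0.001 for all but reanalysis_avg_temp_k, where p ≈ 0.037).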
# Two-sample Wilcoxon rank-sum test, SJ against IQ, for every column from
# index 5 to the last one. The column name is printed ahead of each result.
cnames <- names(sj)
for (i in seq(from = 5, to = ncol(sj))) {
  # Keep the `sj[, i]` / `iq[, i]` spelling: wilcox.test echoes the argument
  # expressions in its printed "data:" line.
  wilt <- wilcox.test(sj[, i], iq[, i])
  print(cnames[i])
  print(wilt)
}
## [1] "ndvi_ne"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 21691, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "ndvi_nw"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 32596, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "ndvi_se"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 107990, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "ndvi_sw"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 78560, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "precipitation_amt_mm"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 118470, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "reanalysis_air_temp_k"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 369950, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "reanalysis_avg_temp_k"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 255790, p-value = 0.03716
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "reanalysis_dew_point_temp_k"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 208230, p-value = 3.071e-05
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "reanalysis_max_air_temp_k"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 4645.5, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "reanalysis_min_air_temp_k"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 474700, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "reanalysis_precip_amt_kg_per_m2"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 139740, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "reanalysis_relative_humidity_percent"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 62770, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "reanalysis_sat_precip_amt_mm"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 118470, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "reanalysis_specific_humidity_g_per_kg"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 192510, p-value = 4.502e-10
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "reanalysis_tdtr_k"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 22, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "station_avg_temp_c"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 183690, p-value = 1.887e-08
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "station_diur_temp_rng_c"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 6834, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "station_max_temp_c"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 59998, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "station_min_temp_c"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 361870, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "station_precip_mm"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 142000, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "total_cases"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj[, i] and iq[, i]
## W = 401310, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
rm(cnames, i, wilt)
Frequency histograms of each variable, for SJ only.
# Frequency histogram of every SJ column from index 5 to the last one,
# drawn two per page.
feature_names <- colnames(sj)
# par() returns the previous settings, so the layout can be put back afterwards.
old_par <- par(mfrow = c(1, 2))
for (idx in 5:ncol(sj)) {
  hist(sj[, idx],
       breaks = 20,
       xlab = feature_names[idx],
       # Same title pattern as the IQ histograms, so the city is identifiable.
       main = paste("Freq Histogram for SJ", feature_names[idx], sep = ": "))
}
par(old_par)
rm(feature_names, idx, old_par)
The same frequency histograms, for IQ only.
# Frequency histogram of every column of `df` from index 5 to the last one,
# restricted to the IQ rows and drawn two per page.
iq_rows <- df$city == "iq"
par(mfrow = c(1, 2))
for (feature in colnames(df)[5:ncol(df)]) {
  hist(df[iq_rows, feature],
       breaks = 20,
       xlab = feature,
       main = paste("Freq Histogram for IQ", feature, sep = ": "))
}
rm(iq_rows, feature)
Line charts over time of every feature for SJ, using the combined training and test feature sets; total_cases is therefore not included. Total_cases over time is plotted separately.
# One line chart per SJ feature (columns 5 onward of the stacked feature
# table), plotted against week_start_date.
# No par(mfrow) here: ggplot2 draws with grid, so base layout settings are
# ignored and each plot is printed on its own page.
cnames <- colnames(sj_all)
for (i in 5:ncol(sj_all)) {
  # sj_all is a tibble, so `sj_all[, i]` is still a one-column tibble and
  # ggplot cannot pick a scale for it. `[[i]]` extracts the plain vector.
  gg1 <- ggplot(sj_all,
                aes(x = week_start_date,
                    y = sj_all[[i]])) +
    geom_line() +
    ylab(cnames[i]) +
    ggtitle(cnames[i])
  print(gg1)
}
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
rm(cnames, i, gg1)
Line charts over time of every feature for IQ, using the combined training and test feature sets; total_cases is therefore not included. Total_cases over time is plotted separately.
# One line chart per IQ feature (columns 5 onward of the stacked feature
# table), plotted against week_start_date.
# No par(mfrow) here: ggplot2 draws with grid, so base layout settings are
# ignored and each plot is printed on its own page.
cnames <- colnames(iq_all)
for (i in 5:ncol(iq_all)) {
  # iq_all is a tibble, so `iq_all[, i]` is still a one-column tibble and
  # ggplot cannot pick a scale for it. `[[i]]` extracts the plain vector.
  gg1 <- ggplot(iq_all,
                aes(x = week_start_date,
                    y = iq_all[[i]])) +
    geom_line() +
    ylab(cnames[i]) +
    ggtitle(cnames[i])
  print(gg1)
}
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
rm(cnames, i, gg1)
Boxplots by week of year of every feature for SJ, using the combined training and test feature sets; total_cases is therefore not included. Total_cases by week is plotted separately.
library(ggplot2)
# One boxplot per SJ feature (columns 5 onward of the stacked feature table),
# grouped by week of the year.
# No par(mfrow) here: ggplot2 draws with grid, so base layout settings are
# ignored and each plot is printed on its own page.
cnames <- colnames(sj_all)
for (i in 5:ncol(sj_all)) {
  # The data argument must be sj_all, the same frame the y values come from.
  # Pairing `sj` with y values from `sj_all` mixes two tables with different
  # row counts. `[[i]]` extracts the column as a plain vector instead of the
  # one-column tibble that `sj_all[, i]` returns.
  gg1 <- ggplot(sj_all,
                aes(x = weekofyear,
                    y = sj_all[[i]],
                    group = weekofyear)) +
    geom_boxplot() +
    scale_x_continuous(breaks = seq(1, 52, 2)) +
    ylab(cnames[i]) +
    ggtitle(cnames[i])
  print(gg1)
}
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 382 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 98 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 38 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 38 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 18 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 18 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
rm(cnames, i, gg1)
Boxplots by week of year of every feature for IQ, using the combined training and test feature sets; total_cases is therefore not included. Total_cases by week is plotted separately.
library(ggplot2)
# One boxplot per IQ feature (columns 5 onward of the stacked feature table),
# grouped by week of the year.
# No par(mfrow) here: ggplot2 draws with grid, so base layout settings are
# ignored and each plot is printed on its own page.
cnames <- colnames(iq_all)
for (i in 5:ncol(iq_all)) {
  # iq_all is a tibble, so `iq_all[, i]` is still a one-column tibble and
  # ggplot cannot pick a scale for it. `[[i]]` extracts the plain vector.
  gg1 <- ggplot(iq_all,
                aes(x = weekofyear,
                    y = iq_all[[i]],
                    group = weekofyear)) +
    geom_boxplot() +
    scale_x_continuous(breaks = seq(1, 52, 2)) +
    ylab(cnames[i]) +
    ggtitle(cnames[i])
  print(gg1)
}
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 74 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 74 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 28 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 16 rows containing non-finite values (stat_boxplot).
## Don't know how to automatically pick scale for object of type tbl_df/tbl/data.frame. Defaulting to continuous.
## Warning: Removed 32 rows containing non-finite values (stat_boxplot).
rm(cnames, i, gg1)
Bar charts of total cases over time for each city. This uses only the training set.
library(ggplot2)
# Weekly dengue case counts over time, one bar chart per city, from the
# labelled training data. The subtitle shows the first and last
# week_start_date available for that city.
# No par(mfcol) here: ggplot2 draws with grid, so base layout settings are
# ignored and each chart is printed on its own page.

# Dengue cases for San Juan (city code "sj")
ggplot(data = df[df$city == "sj", ],
       aes(x = week_start_date, y = total_cases)) +
  geom_bar(stat = "identity", fill = "blue") +
  labs(title = "Total Dengue Cases in San Juan",
       subtitle = paste(min(df$week_start_date[df$city == "sj"]),
                        max(df$week_start_date[df$city == "sj"]),
                        sep = " to "),
       x = "Date", y = "Total dengue cases")

# Dengue cases for Iquitos (city code "iq")
ggplot(data = df[df$city == "iq", ],
       aes(x = week_start_date, y = total_cases)) +
  geom_bar(stat = "identity", fill = "green") +
  labs(title = "Total Dengue Cases in Iquitos",
       subtitle = paste(min(df$week_start_date[df$city == "iq"]),
                        max(df$week_start_date[df$city == "iq"]),
                        sep = " to "),
       x = "Date", y = "Total dengue cases")
Boxplots and bar graphs of total cases by week of year for each city. This uses only the training set.
library(ggplot2)

# Boxplot of total_cases for each week of the year, with the weekly mean
# overlaid as a red point.
weekly_case_boxplot <- function(city_data, plot_title) {
  ggplot(city_data,
         aes(x = weekofyear, y = total_cases, group = weekofyear)) +
    geom_boxplot() +
    scale_x_continuous(breaks = seq(1, 52, 1)) +
    stat_summary(fun.y = mean, geom = "point", shape = 20, size = 3,
                 color = "red", fill = "red") +
    ylab("Total cases") +
    ggtitle(plot_title)
}

# Bar graph of the mean total_cases for each week of the year.
weekly_case_means <- function(city_data, plot_title) {
  ggplot(data = city_data, aes(x = weekofyear, y = total_cases)) +
    geom_bar(stat = "summary", fun.y = "mean") +
    ggtitle(plot_title) +
    scale_x_continuous(breaks = seq(1, 52, 2))
}

# SJ first (boxplot, then means), followed by IQ in the same order.
print(weekly_case_boxplot(sj, "Boxplot: Total cases by Week for SJ"))
print(weekly_case_means(sj, "Bar graph: Average total cases by Week for SJ"))
print(weekly_case_boxplot(iq, "Boxplot: Total cases by Week for IQ"))
print(weekly_case_means(iq, "Bar graph: Average total cases by Week for IQ"))

# The helpers are only needed for these four plots.
rm(weekly_case_boxplot, weekly_case_means)
Scatter plots of each climate variable against total_cases, for SJ.
# Scatter plot of every climate variable (y) against total_cases (x) for SJ,
# four per page. The loop stops one column short of the end, which skips the
# last column (assumed to be total_cases itself -- confirm the column order).
feature_names <- colnames(df)
is_sj <- df$city == "sj"
# par() returns the previous settings, so the layout can be put back afterwards.
old_par <- par(mfrow = c(2, 2))
for (idx in 5:(ncol(df) - 1)) {
  plot(df$total_cases[is_sj],
       df[is_sj, idx],
       cex = 0.5,
       pch = 19,
       # y range taken over both cities so the SJ and IQ panels are comparable
       ylim = range(df[, idx], na.rm = TRUE),
       main = paste("Total_cases for SJ by climate variables",
                    feature_names[idx], sep = ": "),
       # explicit label: without it the axis shows the raw R expression
       xlab = "total_cases",
       ylab = feature_names[idx])
}
par(old_par)
rm(feature_names, is_sj, idx, old_par)
Scatter plots of each climate variable against total_cases, for IQ.
# Scatter plot of every climate variable (y) against total_cases (x) for IQ,
# four per page. The loop stops one column short of the end, which skips the
# last column (assumed to be total_cases itself -- confirm the column order).
feature_names <- colnames(df)
is_iq <- df$city == "iq"
# par() returns the previous settings, so the layout can be put back afterwards.
old_par <- par(mfrow = c(2, 2))
for (idx in 5:(ncol(df) - 1)) {
  plot(df$total_cases[is_iq],
       df[is_iq, idx],
       cex = 0.5,
       pch = 19,
       # y range taken over both cities so the SJ and IQ panels are comparable
       ylim = range(df[, idx], na.rm = TRUE),
       main = paste("Total_cases for IQ by climate variables",
                    feature_names[idx], sep = ": "),
       # explicit label: without it the axis shows the raw R expression
       xlab = "total_cases",
       ylab = feature_names[idx])
}
par(old_par)
rm(feature_names, is_iq, idx, old_par)
There are several variables which appear to be the same feature but taken from a different source. For example, station_precip_mm, precipitation_amt_mm and reanalysis_sat_precip_amt_mm all appear to be the same “Total Precipitation” value. Only one should be kept if they are the same.
“station_max_temp_c” and “reanalysis_max_air_temp_k” (converted to Celsius)
library(ggplot2)
# Gap between the station and reanalysis weekly maximum temperatures for SJ
sj$max_air_diff <- with(sj, station_max_temp_c - reanalysis_max_air_temp_k)
# Bars of the gap, stacked within each year
ggplot(sj, aes(x = year, y = max_air_diff)) +
  geom_col()
## Warning: Removed 6 rows containing missing values (position_stack).
# Spread of the gap within each year
ggplot(sj, aes(x = year, y = max_air_diff, group = year)) +
  geom_boxplot()
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
# Calendar month (1-12) of each week's start date
sj$month <- as.POSIXlt(sj$week_start_date)$mon + 1
# Spread of the gap within each calendar month
ggplot(sj, aes(x = month, y = max_air_diff, group = month)) +
  geom_boxplot() +
  scale_x_continuous(breaks = seq(1, 12, 1))
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
# Drop the two temporary columns again
sj[c("max_air_diff", "month")] <- NULL
“station_min_temp_c” and “reanalysis_min_air_temp_k” (converted to Celsius)
library(ggplot2)
# Gap between the station and reanalysis weekly minimum temperatures for SJ
sj$min_air_diff <- with(sj, station_min_temp_c - reanalysis_min_air_temp_k)
# Bars of the gap, stacked within each year
ggplot(sj, aes(x = year, y = min_air_diff)) +
  geom_col()
## Warning: Removed 6 rows containing missing values (position_stack).
# Spread of the gap within each year
ggplot(sj, aes(x = year, y = min_air_diff, group = year)) +
  geom_boxplot()
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
# Calendar month (1-12) of each week's start date
sj$month <- as.POSIXlt(sj$week_start_date)$mon + 1
# Spread of the gap within each calendar month
ggplot(sj, aes(x = month, y = min_air_diff, group = month)) +
  geom_boxplot() +
  scale_x_continuous(breaks = seq(1, 12, 1))
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
# Drop the two temporary columns again
sj[c("min_air_diff", "month")] <- NULL
“station_avg_temp_c” and “reanalysis_avg_temp_k” (converted to Celsius)
library(ggplot2)
# Gap between the station and reanalysis weekly average temperatures for SJ
sj$avg_air_diff <- with(sj, station_avg_temp_c - reanalysis_avg_temp_k)
# Bars of the gap, stacked within each year
ggplot(sj, aes(x = year, y = avg_air_diff)) +
  geom_col()
## Warning: Removed 6 rows containing missing values (position_stack).
# Spread of the gap within each year
ggplot(sj, aes(x = year, y = avg_air_diff, group = year)) +
  geom_boxplot()
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
# Calendar month (1-12) of each week's start date
sj$month <- as.POSIXlt(sj$week_start_date)$mon + 1
# Spread of the gap within each calendar month
ggplot(sj, aes(x = month, y = avg_air_diff, group = month)) +
  geom_boxplot() +
  scale_x_continuous(breaks = seq(1, 12, 1))
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
# Drop the two temporary columns again
sj[c("avg_air_diff", "month")] <- NULL
“station_precip_mm”, “precipitation_amt_mm”, “reanalysis_sat_precip_amt_mm”, “reanalysis_precip_amt_kg_per_m2”
library(ggplot2)
# Compare every pair of precipitation variables: for each pair, plot the
# difference (first minus second) as a bar chart by year, a boxplot by year
# and a boxplot by month.
precip <- c("station_precip_mm", "precipitation_amt_mm", "reanalysis_sat_precip_amt_mm", "reanalysis_precip_amt_kg_per_m2")
# Add month to the dataframe (POSIXlt months are 0-based, hence the + 1)
sj$month <- as.POSIXlt(sj$week_start_date)$mon + 1
# All unordered pairs, one pair per column, in the same order a nested
# i < j loop over `precip` would visit them
precip_pairs <- combn(precip, 2)
for (k in seq_len(ncol(precip_pairs))) {
  p1 <- precip_pairs[1, k]
  p2 <- precip_pairs[2, k]
  pair_title <- paste(p1, "&", p2)
  # Extract by name with [[ ]] so the result is a plain vector whether sj is
  # a base data.frame or a tibble
  sj$diff <- sj[[p1]] - sj[[p2]]
  # barplot the difference by year
  gg1 <- ggplot(sj, aes(x = year, y = diff)) +
    geom_bar(stat = "identity", fill = "steelblue") +
    ggtitle(pair_title)
  print(gg1)
  # box plot the difference by year
  gg2 <- ggplot(sj, aes(x = year, y = diff, group = year)) +
    geom_boxplot() +
    ggtitle(pair_title)
  print(gg2)
  # box plot the difference by month
  gg3 <- ggplot(sj, aes(x = month, y = diff, group = month)) +
    geom_boxplot() +
    scale_x_continuous(breaks = seq(1, 12, 1)) +
    ggtitle(pair_title)
  print(gg3)
}
## Warning: Removed 9 rows containing missing values (position_stack).
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
## Warning: Removed 9 rows containing missing values (position_stack).
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
## Warning: Removed 6 rows containing missing values (position_stack).
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
## Warning: Removed 9 rows containing missing values (position_stack).
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
## Warning: Removed 9 rows containing missing values (position_stack).
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
## Warning: Removed 9 rows containing missing values (position_stack).
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
# Drop the temporary columns and loop variables
sj$diff <- NULL
sj$month <- NULL
rm(gg1, gg2, gg3, k, p1, p2, pair_title, precip, precip_pairs)
Boxplots include both the test and training sets; NA values are still included
library(ggplot2)
# Draw one boxplot per variable, split by city, for every column of df from
# the fifth onwards.
cnames <- colnames(df)
# Skip the first four columns. Unlike 5:ncol(df), this gives an empty
# sequence rather than counting backwards when df has fewer than five columns.
for (i in setdiff(seq_len(ncol(df)), 1:4)) {
  # Copy the current variable into a small plotting frame so that aes() maps
  # real columns of the plot data instead of an external vector (df[, i]);
  # [[ ]] returns a plain vector for both base data.frames and tibbles.
  plot_data <- data.frame(city = df[["city"]], value = df[[i]],
                          stringsAsFactors = FALSE)
  p <- ggplot(plot_data, aes(x = city, y = value, fill = city)) +
    geom_boxplot() +
    labs(title = "Boxplot of climate variables",
         subtitle = cnames[i],
         x = "City", y = cnames[i])
  print(p)
}
## Warning: Removed 194 rows containing non-finite values (stat_boxplot).
## Warning: Removed 52 rows containing non-finite values (stat_boxplot).
## Warning: Removed 22 rows containing non-finite values (stat_boxplot).
## Warning: Removed 22 rows containing non-finite values (stat_boxplot).
## Warning: Removed 13 rows containing non-finite values (stat_boxplot).
## Warning: Removed 10 rows containing non-finite values (stat_boxplot).
## Warning: Removed 10 rows containing non-finite values (stat_boxplot).
## Warning: Removed 10 rows containing non-finite values (stat_boxplot).
## Warning: Removed 10 rows containing non-finite values (stat_boxplot).
## Warning: Removed 10 rows containing non-finite values (stat_boxplot).
## Warning: Removed 10 rows containing non-finite values (stat_boxplot).
## Warning: Removed 10 rows containing non-finite values (stat_boxplot).
## Warning: Removed 13 rows containing non-finite values (stat_boxplot).
## Warning: Removed 10 rows containing non-finite values (stat_boxplot).
## Warning: Removed 10 rows containing non-finite values (stat_boxplot).
## Warning: Removed 43 rows containing non-finite values (stat_boxplot).
## Warning: Removed 43 rows containing non-finite values (stat_boxplot).
## Warning: Removed 20 rows containing non-finite values (stat_boxplot).
## Warning: Removed 14 rows containing non-finite values (stat_boxplot).
## Warning: Removed 22 rows containing non-finite values (stat_boxplot).
# Remove the loop helpers from the workspace
rm(cnames, i, p, plot_data)
library(ggplot2)
# Boxplot of total_cases for each city, filled by city
ggplot(data = df, mapping = aes(x = city, y = total_cases, fill = city)) +
  geom_boxplot() +
  labs(
    x = "City",
    y = "Total_cases",
    title = "Boxplot of Total_cases"
  )